load libraries

library(ggplot2)
library(dplyr)
library(MultiAssayExperiment) 
library(limma)
library(EnhancedVolcano)
library(DT)
library(RColorBrewer)
library(readr)

Data Loading and Preparation

Data Overview:

Load multiassay .rds file, extract clinical, expression and annotations data; prepare gene expression data for analysis.

# Load your multiassay result and extract clinical data , expression data and annotation

#load mae obj
mae <- readRDS("~/BHK lab/Ravi_Testing/output/ICB_Raviii.rds")

#extract Clinical data 
clin <- data.frame(colData(mae))

#extract the expression data
expr <- assays(mae)[["expr"]]

#extracting the annotation 
annot <- data.frame(rowData(mae@ExperimentList$expr))

# Display first few rows of the dataset
DT::datatable(expr[1:8, 1:4])

Principal Component Analysis (PCA)

Objective: Perform PCA on gene expression data to assess variation across different research centers.

Data Preparation: Following the RNA-seq differential expression analysis section from a paper, the analysis of differentially expressed genes involved filtering for protein-coding transcripts with a minimum expression of log2TPM ≥ 0.5 in at least 30% of samples.

Therefore, steps for PCA-ready gene expression data include:

  1. Restrict expression data to protein-coding genes.
  2. Convert expression data from log2(TPM+0.001) to log2(TPM+1) for consistency.
  3. Remove low/zero expression genes; Keep genes with expression at log2TPM ≥ 0.5 across at least 30% of samples.
# Step 1: Restrict to Protein-Coding Genes
# Filter out non-protein-coding transcripts for focused analysis
annot_proteincoding <- annot[annot$gene_type == "protein_coding",] # 19158 protein coding genes.
expr <- expr[rownames(expr) %in% rownames(annot_proteincoding),]

# Step 2: Normalize Expression Data
# Convert from expr (log2(TPM + 0.001)) to standard log2(TPM + 1) format
expr <- log2((2**expr - 0.001) + 1)

# Step 3: Filter Low/Zero Expression Genes
# threshold for expr >= 0.5 per transcript
thresh <- expr >= 0.5

# Identify and keep transcripts with sufficient expression levels
keep <- rowSums(thresh) >= (0.3 * ncol(expr))
expr <- expr[keep,]  # Dimensions: 15,857 x 148 - including 15k protein-coding

# Display expr
DT::datatable(expr[1:8, 1:4])

Perform PCA

  • Calculate PCA
  • Then Merge PCA and Clinical Data Subset
#1. PCA 

# Transpose expr data for PCA , samples as rows and genes as columns.
expr_t <- t(expr) #samples as rows and genes as columns

# Calculate PCA 
pc <- prcomp(expr_t, center = TRUE, scale. = TRUE)

# Calculate and print the percentage of variance explained by each principal component
var_res <- pc$sdev^2 / sum(pc$sdev^2) * 100
var_res <- round(var_res, 2)  # Round to 2 decimal places

print(var_res)
##   [1] 28.19  5.71  4.15  3.85  3.35  2.99  2.63  2.24  2.10  2.06  1.97  1.73
##  [13]  1.67  1.64  1.44  1.40  1.29  1.18  1.07  1.01  0.95  0.89  0.81  0.78
##  [25]  0.72  0.66  0.64  0.63  0.59  0.57  0.54  0.52  0.50  0.46  0.46  0.45
##  [37]  0.43  0.41  0.41  0.39  0.38  0.37  0.36  0.36  0.34  0.33  0.32  0.31
##  [49]  0.31  0.30  0.29  0.29  0.29  0.28  0.27  0.26  0.26  0.25  0.25  0.25
##  [61]  0.24  0.24  0.23  0.23  0.22  0.22  0.22  0.21  0.21  0.21  0.20  0.20
##  [73]  0.19  0.19  0.19  0.18  0.18  0.17  0.17  0.17  0.17  0.16  0.16  0.16
##  [85]  0.16  0.15  0.15  0.15  0.15  0.15  0.14  0.14  0.13  0.13  0.13  0.13
##  [97]  0.12  0.12  0.12  0.12  0.12  0.12  0.11  0.11  0.11  0.11  0.10  0.10
## [109]  0.10  0.10  0.10  0.10  0.09  0.09  0.09  0.09  0.09  0.08  0.08  0.08
## [121]  0.08  0.08  0.08  0.07  0.07  0.07  0.07  0.07  0.06  0.06  0.06  0.06
## [133]  0.06  0.06  0.06  0.05  0.05  0.05  0.05  0.05  0.04  0.04  0.04  0.04
## [145]  0.03  0.03  0.03  0.00
# 2. Merge PCA and Clinical Subset

# Convert PCA results to a data frame
pcx <- data.frame(pc$x)

# Add patient IDs from row names to the data frame
pcx$patientid <- rownames(pcx)


# Merge PCA data with patient IDs and institutions from clinical data
pcx_merge <- merge(pcx, clin[, c("patientid", "Institution")], by="patientid")

# Set row names to patient IDs
rownames(pcx_merge) <- pcx_merge[,1]

# Removeredundant patient ID column
pcx_merge <- pcx_merge[,-1] 

Visualization

  • Bar Plot : To visualize variance by each principal component.

The bar plot displays PC1 as the longest, explaining the most variance, while subsequent bars (PC2, PC3, etc.) become progressively shorter, indicating diminishing variance.

#To find the largest PC of data

barplot(var_res, main="Bar Plot", xlab="Principal Component", ylab="Percentage of Variance ", col="skyblue")

1. PCA Results Plot

  • Reason :To visualize variance by each principal component.

  • Result :PCA on gene expression data, shows that the ‘Institution’ factor has no significant effect on gene expression data sampling. (no Batch effect has been detected)

# Create labels for the plot
xlab_text <- paste("PC1 (", var_res[1], "%)", sep = "")
ylab_text <- paste("PC2 (", var_res[2], "%)", sep = "")

# Plot PCA results
ggplot(pcx_merge, aes(PC1, PC2, color = Institution)) +
  theme_bw() +
  geom_point() +
  labs(x = xlab_text, y = ylab_text)

2. Limma Approach

  • This approach is used to find differentially expressed genes between responders and non-responders.

Compare Responders and Nonresponders Using Limma Voom:

  1. Analysis with Harmonized_Confirmed_BOR_Bin:
    • Apply limma voom for differential expression using the Harmonized_Confirmed_BOR_Bin from the clinical dataset, aiming to distinguish responders (PR/CR) from nonresponders (SD/PD). This was mentioned in paper method section RNA-seq Differential Expression
  2. Analysis response:
    • Further differential expression analysis is conducted, utilizing the response classification obtained via the Get_Response script.
# Filter expression data based on 'Harmonized_Confirmed_BOR_Bin' column in 'clin'
bor_filtered <- clin$Harmonized_Confirmed_BOR_Bin[!is.na(clin$Harmonized_Confirmed_BOR_Bin)]
expr_bor <- expr[, !is.na(clin$Harmonized_Confirmed_BOR_Bin)] # Remove 12 rows with missing responses

# Create design matrix and fit linear model
design_bor <- model.matrix(~ bor_filtered)
fit_bor <- lmFit(expr_bor, design_bor)

# Perform eBayes analysis and display results
fit_bor <- eBayes(fit_bor)
datatable(topTable(fit_bor))
# Repeat process for 'response' column in 'clin'
response_filtered <- clin$response[!is.na(clin$response)]
expr_response <- expr[, !is.na(clin$response)] # Remove 27 rows with missing responses

# Create design matrix and fit linear model
design_response <- model.matrix(~ response_filtered)
fit_response <- lmFit(expr_response, design_response)

# Perform eBayes analysis and display results
fit_response <- eBayes(fit_response)
datatable(topTable(fit_response))

3. Volcano plot Prepration

  • Preparing Data for volcano Plot

  • Convert fit, both fit_bor and fit_response, to Data frame and add column gene_name symbol

# Retrieve all results from the analysis and convert
volcano_data_res <- topTable(fit_bor, number=Inf)
volcano_data_bor <- topTable(fit_bor, number=Inf)

# Convert the result to a data frame
df_res <- as.data.frame(volcano_data_res)
df_bor <- as.data.frame(volcano_data_bor)

# Subset 'gene_id_no_ver' and 'gene_name' from the gene data
subset_annot <- annot_proteincoding[, c("gene_id_no_ver", "gene_name")]

# Add a 'gene_id' column to the volcano_data
volcano_data_res$gene_id_no_ver <- rownames(volcano_data_res)
volcano_data_bor$gene_id_no_ver <- rownames(volcano_data_bor)

# Merge 'volcano_data' and 'annot_proteincoding' by 'gene_id'
merge_result_res <- merge(volcano_data_res, subset_annot, by = "gene_id_no_ver")
merge_result_bor <- merge(volcano_data_bor, subset_annot, by = "gene_id_no_ver")

# Display the merged result in a table format
datatable(merge_result_res)
datatable(merge_result_bor)

Volcano plot

Fig. 2 in the paper details a Volcano plot from limma voom analysis, displaying two-sided nominal P values. Genes are identified as significantly differentially expressed using cutoffs of |log2(fold change)| > 0.5 and P < 0.05.

# 1.Volcano Plot based on P value:

# For merge_result_res:

EnhancedVolcano(merge_result_res,
    lab = merge_result_res$gene_name,
    x = 'logFC',
    y = 'P.Value',  
    pCutoff = 0.05,
    FCcutoff = 0.5,
    xlim = c(-5, 5),
    ylim = c(0, -log10(10e-4)),
    title= 'Volcano Plot based on P value (based on BOR)'
    )

# For merge_result_res:

EnhancedVolcano(merge_result_bor,
    lab = merge_result_bor$gene_name,
    x = 'logFC',
    y = 'P.Value',
    pCutoff = 0.05,
    FCcutoff = 0.5,
    xlim = c(-5, 5),
    ylim = c(0, -log10(10e-4)),
    title= 'Volcano Plot based on P value (Based on response)'
    )

### 4. Figure 1(a) of paper

  • First 1.(a)_RNA cohort in different institutions.
clin <- data.frame(colData(mae))
clin_subset <-clin[,c("Institution","rna")]


# Counting the occurrences of each institution
institution_counts <- table(clin_subset$Institution)

# Creating the pie chart
pie_chart <- ggplot(clin_subset, aes(x = "", fill = factor(Institution))) +
  geom_bar(width = 1, stat = "count") +
  coord_polar("y", start = 0) +
  theme_void() +
  theme(legend.title = element_blank()) +
  scale_fill_manual(values = c("#999999", "#E69F00", "#56B4E9",
                               "#009E73", "#F0E442", "#0072B2",
                               "#D55E00", "#CC79A7", "#999999",
                               "#999999", "#999999", "#999999",
                               "#999999", "#999999", "#999999"))

# Printing the pie chart
print(pie_chart)

5. Figure 1(b) of paper

Figure 1(b):BOR/recist by PDL1 TPS category.

# Select only the necessary columns
clin <- clin[, c("PDL1_TPS_Description", "recist", "PDL1_TPS")]

# Convert NA values to a specific level
clin$PDL1_TPS_Description <- forcats::fct_explicit_na(clin$PDL1_TPS_Description, na_level = "Unknown")
clin$PDL1_TPS_Description <- factor(clin$PDL1_TPS_Description, levels = c("<1%", "1-49%", ">50%", "Unknown"))

clin$recist <- forcats::fct_explicit_na(clin$recist, na_level = "NE")

# Count the occurrences of combinations of 'PDL1_TPS_Description' and 'recist'
# Note: Do not quote the column names
counted_data <- dplyr::count(clin, PDL1_TPS_Description, recist)

# Group by 'PDL1_TPS_Description' and calculate frequencies
grouped_data <- group_by(counted_data, PDL1_TPS_Description)
data_summary <- mutate(grouped_data, freq = n / sum(n))

# Plotting
ggplot(data_summary, aes(x = PDL1_TPS_Description, y = freq, fill = recist)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "PDL1 TPS", y = "BOR Proportion", fill = "RECIST Category") +
  geom_text(aes(label = scales::percent(freq)), position = position_stack(vjust = 0.5)) +
  theme_minimal()